# coding=utf-8

import requests
import urllib.parse
from bs4 import BeautifulSoup
import csv
from lxml import etree
import time


class LianJiaSpider():
    def __init__(self, pages=10, city='北京'):
        """Resolve the city, queue the page URLs and open the output CSV.

        Args:
            pages: Number of result pages to queue; passed to ``get_url``,
                which raises if it is not an int.
            city: City name looked up in the built-in city table through
                ``is_have_city``; also used in the CSV file name and,
                URL-quoted, as the search keyword.

        Raises:
            Exception: If no entry of the city table matches ``city``, or
                (raised by ``get_url``) if ``pages`` is not an int.
        """
        self.pages = pages
        self.city = city
        self.result_city = []  # only filled by get_lianjia_all()
        self.url_list = []  # page URLs queued by get_url()
        self.city_url = "https://www.lianjia.com/city/"
        # Hard-coded table of known cities and their site roots.
        self.city_list = [
            {'city': '安庆', 'href': 'https://aq.lianjia.com/'},
            {'city': '滁州', 'href': 'https://cz.fang.lianjia.com/'},
            {'city': '阜阳', 'href': 'https://fy.lianjia.com/'},
            {'city': '合肥', 'href': 'https://hf.lianjia.com/'},
            {'city': '马鞍山', 'href': 'https://mas.lianjia.com/'},
            {'city': '芜湖', 'href': 'https://wuhu.lianjia.com/'},
            {'city': '北京', 'href': 'https://bj.lianjia.com/'},
            {'city': '重庆', 'href': 'https://cq.lianjia.com/'},
            {'city': '福州', 'href': 'https://fz.lianjia.com/'},
            {'city': '泉州', 'href': 'https://quanzhou.lianjia.com/'},
            {'city': '厦门', 'href': 'https://xm.lianjia.com/'},
            {'city': '漳州', 'href': 'https://zhangzhou.lianjia.com/'},
            {'city': '东莞', 'href': 'https://dg.lianjia.com/'},
            {'city': '佛山', 'href': 'https://fs.lianjia.com/'},
            {'city': '广州', 'href': 'https://gz.lianjia.com/'},
            {'city': '惠州', 'href': 'https://hui.lianjia.com/'},
            {'city': '江门', 'href': 'https://jiangmen.lianjia.com/'},
            {'city': '清远', 'href': 'https://qy.lianjia.com/'},
            {'city': '深圳', 'href': 'https://sz.lianjia.com/'},
            {'city': '珠海', 'href': 'https://zh.lianjia.com/'},
            {'city': '湛江', 'href': 'https://zhanjiang.lianjia.com/'},
            {'city': '中山', 'href': 'https://zs.lianjia.com/'},
            {'city': '北海', 'href': 'https://bh.lianjia.com/'},
            {'city': '防城港', 'href': 'https://fcg.lianjia.com/'},
            {'city': '桂林', 'href': 'https://gl.lianjia.com/'},
            {'city': '柳州', 'href': 'https://liuzhou.lianjia.com/'},
            {'city': '南宁', 'href': 'https://nn.lianjia.com/'},
            {'city': '兰州', 'href': 'https://lz.lianjia.com/'},
            {'city': '天水', 'href': 'https://tianshui.lianjia.com/'},
            {'city': '贵阳', 'href': 'https://gy.lianjia.com/'},
            {'city': '保定', 'href': 'https://bd.lianjia.com/'},
            {'city': '邯郸', 'href': 'https://hd.lianjia.com/'},
            {'city': '廊坊', 'href': 'https://lf.lianjia.com/'},
            {'city': '秦皇岛', 'href': 'https://qhd.fang.lianjia.com/'},
            {'city': '石家庄', 'href': 'https://sjz.lianjia.com/'},
            {'city': '唐山', 'href': 'https://ts.lianjia.com/'},
            {'city': '张家口', 'href': 'https://zjk.lianjia.com/'},
            {'city': '保亭', 'href': 'https://bt.fang.lianjia.com/'},
            {'city': '澄迈', 'href': 'https://cm.lianjia.com/'},
            {'city': '儋州', 'href': 'https://dz.fang.lianjia.com/'},
            {'city': '海口', 'href': 'https://hk.lianjia.com/'},
            {'city': '临高', 'href': 'https://lg.fang.lianjia.com/'},
            {'city': '乐东', 'href': 'https://ld.fang.lianjia.com/'},
            {'city': '陵水', 'href': 'https://ls.fang.lianjia.com/'},
            {'city': '琼海', 'href': 'https://qh.fang.lianjia.com/'},
            {'city': '三亚', 'href': 'https://san.lianjia.com/'},
            {'city': '五指山', 'href': 'https://wzs.fang.lianjia.com/'},
            {'city': '文昌', 'href': 'https://wc.fang.lianjia.com/'},
            {'city': '万宁', 'href': 'https://wn.fang.lianjia.com/'},
            {'city': '长沙', 'href': 'https://cs.lianjia.com/'},
            {'city': '常德', 'href': 'https://changde.lianjia.com/'},
            {'city': '湘西', 'href': 'https://xx.lianjia.com/'},
            {'city': '岳阳', 'href': 'https://yy.lianjia.com/'},
            {'city': '株洲', 'href': 'https://zhuzhou.lianjia.com/'},
            {'city': '开封', 'href': 'https://kf.lianjia.com/'},
            {'city': '洛阳', 'href': 'https://luoyang.lianjia.com/'},
            {'city': '三门峡', 'href': 'https://smx.fang.lianjia.com/'},
            {'city': '新乡', 'href': 'https://xinxiang.lianjia.com/'},
            {'city': '许昌', 'href': 'https://xc.lianjia.com/'},
            {'city': '郑州', 'href': 'https://zz.lianjia.com/'},
            {'city': '周口', 'href': 'https://zk.lianjia.com/'},
            {'city': '驻马店', 'href': 'https://zmd.lianjia.com/'},
            {'city': '鄂州', 'href': 'https://ez.lianjia.com/'},
            {'city': '黄石', 'href': 'https://huangshi.lianjia.com/'},
            {'city': '武汉', 'href': 'https://wh.lianjia.com/'},
            {'city': '襄阳', 'href': 'https://xy.lianjia.com/'},
            {'city': '宜昌', 'href': 'https://yichang.lianjia.com/'},
            {'city': '哈尔滨', 'href': 'https://hrb.lianjia.com/'},
            {'city': '赣州', 'href': 'https://ganzhou.lianjia.com/'},
            {'city': '九江', 'href': 'https://jiujiang.lianjia.com/'},
            {'city': '吉安', 'href': 'https://jian.lianjia.com/'},
            {'city': '南昌', 'href': 'https://nc.lianjia.com/'},
            {'city': '上饶', 'href': 'https://sr.lianjia.com/'},
            {'city': '常州', 'href': 'https://changzhou.lianjia.com/'},
            {'city': '常熟', 'href': 'https://changshu.lianjia.com/'},
            {'city': '丹阳', 'href': 'https://danyang.lianjia.com/'},
            {'city': '海门', 'href': 'https://haimen.lianjia.com/'},
            {'city': '淮安', 'href': 'https://ha.lianjia.com/'},
            {'city': '江阴', 'href': 'https://jy.lianjia.com/'},
            {'city': '句容', 'href': 'https://jr.lianjia.com/'},
            {'city': '昆山', 'href': 'https://ks.lianjia.com/'},
            {'city': '南京', 'href': 'https://nj.lianjia.com/'},
            {'city': '南通', 'href': 'https://nt.lianjia.com/'},
            {'city': '苏州', 'href': 'https://su.lianjia.com/'},
            {'city': '太仓', 'href': 'https://taicang.lianjia.com/'},
            {'city': '无锡', 'href': 'https://wx.lianjia.com/'},
            {'city': '徐州', 'href': 'https://xz.lianjia.com/'},
            {'city': '盐城', 'href': 'https://yc.lianjia.com/'},
            {'city': '镇江', 'href': 'https://zj.lianjia.com/'},
            {'city': '长春', 'href': 'https://cc.lianjia.com/'},
            {'city': '吉林', 'href': 'https://jl.lianjia.com/'},
            {'city': '大连', 'href': 'https://dl.lianjia.com/'},
            {'city': '丹东', 'href': 'https://dd.lianjia.com/'},
            {'city': '抚顺', 'href': 'https://fushun.lianjia.com/'},
            {'city': '沈阳', 'href': 'https://sy.lianjia.com/'},
            {'city': '包头', 'href': 'https://baotou.lianjia.com/'},
            {'city': '赤峰', 'href': 'https://cf.lianjia.com/'},
            {'city': '呼和浩特', 'href': 'https://hhht.lianjia.com/'},
            {'city': '银川', 'href': 'https://yinchuan.lianjia.com/'},
            {'city': '菏泽', 'href': 'https://heze.lianjia.com/'},
            {'city': '济南', 'href': 'https://jn.lianjia.com/'},
            {'city': '济宁', 'href': 'https://jining.lianjia.com/'},
            {'city': '临沂', 'href': 'https://linyi.lianjia.com/'},
            {'city': '青岛', 'href': 'https://qd.lianjia.com/'},
            {'city': '泰安', 'href': 'https://ta.lianjia.com/'},
            {'city': '潍坊', 'href': 'https://wf.lianjia.com/'},
            {'city': '威海', 'href': 'https://weihai.lianjia.com/'},
            {'city': '烟台', 'href': 'https://yt.lianjia.com/'},
            {'city': '淄博', 'href': 'https://zb.lianjia.com/'},
            {'city': '成都', 'href': 'https://cd.lianjia.com/'},
            {'city': '德阳', 'href': 'https://dy.lianjia.com/'},
            {'city': '达州', 'href': 'https://dazhou.lianjia.com/'},
            {'city': '广元', 'href': 'https://guangyuan.lianjia.com/'},
            {'city': '乐山', 'href': 'https://leshan.fang.lianjia.com/'},
            {'city': '凉山', 'href': 'https://liangshan.lianjia.com/'},
            {'city': '绵阳', 'href': 'https://mianyang.lianjia.com/'},
            {'city': '眉山', 'href': 'https://ms.fang.lianjia.com/'},
            {'city': '南充', 'href': 'https://nanchong.lianjia.com/'},
            {'city': '攀枝花', 'href': 'https://pzh.lianjia.com/'},
            {'city': '遂宁', 'href': 'https://sn.lianjia.com/'},
            {'city': '宜宾', 'href': 'https://yibin.lianjia.com/'},
            {'city': '雅安', 'href': 'https://yaan.lianjia.com/'},
            {'city': '宝鸡', 'href': 'https://baoji.lianjia.com/'},
            {'city': '汉中', 'href': 'https://hanzhong.lianjia.com/'},
            {'city': '西安', 'href': 'https://xa.lianjia.com/'},
            {'city': '咸阳', 'href': 'https://xianyang.lianjia.com/'},
            {'city': '晋中', 'href': 'https://jz.lianjia.com/'},
            {'city': '太原', 'href': 'https://ty.lianjia.com/'},
            {'city': '运城', 'href': 'https://yuncheng.lianjia.com/'},
            {'city': '上海', 'href': 'https://sh.lianjia.com/'},
            {'city': '天津', 'href': 'https://tj.lianjia.com/'},
            {'city': '乌鲁木齐', 'href': 'https://wlmq.lianjia.com/'},
            {'city': '大理', 'href': 'https://dali.lianjia.com/'},
            {'city': '昆明', 'href': 'https://km.lianjia.com/'},
            {'city': '西双版纳', 'href': 'https://xsbn.fang.lianjia.com/'},
            {'city': '杭州', 'href': 'https://hz.lianjia.com/'},
            {'city': '湖州', 'href': 'https://huzhou.lianjia.com/'},
            {'city': '嘉兴', 'href': 'https://jx.lianjia.com/'},
            {'city': '金华', 'href': 'https://jh.lianjia.com/'},
            {'city': '宁波', 'href': 'https://nb.lianjia.com/'},
            {'city': '衢州', 'href': 'https://quzhou.lianjia.com/'},
            {'city': '绍兴', 'href': 'https://sx.lianjia.com/'},
            {'city': '台州', 'href': 'https://taizhou.lianjia.com/'},
            {'city': '温州', 'href': 'https://wz.lianjia.com/'},
            {'city': '义乌', 'href': 'https://yw.lianjia.com/'},
        ]

        matched = self.is_have_city(self.city_list)
        if not matched:
            raise Exception('没有找寻到相关地址，请重新输入!')
        print('获取到地址信息:{}, 链接：{}'.format(self.city, matched['href']))
        self.get_url(matched['href'], self.pages, urllib.parse.quote(city))

        # The output file is created only after the city and page count
        # have been accepted above.
        csv_name = '{}-链家网.csv'.format(self.city)
        self.f = open(csv_name, 'w', encoding='utf-8-sig', newline='')
        self.csv_file = csv.writer(self.f)
        header = ['区域', '标题', '小区', '总价/万', '单价', '户型', '面积', '朝向', '风格', '楼层']
        self.csv_file.writerow(header)

        # Set to True by get_parse() when a page contains no listings.
        self.flag = False

    def get_lianjia_all(self, timeout=10):
        """Fetch the Lianjia city index page and collect the cities it lists.

        Every anchor found under the element with class ``city_list`` is
        appended to ``self.result_city`` as a dict of the form
        ``{"city": <link text>, "href": <link target>}``. Anchors without
        an ``href`` or without text are skipped.

        Args:
            timeout: Seconds ``requests`` waits for the server. Without a
                timeout a stalled connection would block the call forever.
        """
        response = requests.get(self.city_url, timeout=timeout)
        html = etree.HTML(response.text)
        for anchor in html.xpath('//*[@class="city_list"]//li/a'):
            href = anchor.xpath('./@href')
            city_name = anchor.xpath('string(.)')
            if href and city_name:
                self.result_city.append({
                    "city": city_name,
                    "href": href[0]
                })
    def get_url(self, city_url, pages, wd):
        """Queue the second-hand listing URL of every page from 1 to ``pages``.

        The URLs are appended to ``self.url_list`` and look like
        ``<city_url>ershoufang/pg<page>rs<wd>/``.

        Args:
            city_url: City site root; the path is concatenated directly,
                so it is expected to end with ``/``.
            pages: Number of pages to queue.
            wd: Search keyword placed after ``rs`` in the path.

        Raises:
            Exception: If ``pages`` is not an int.
        """
        if not isinstance(pages, int):
            raise Exception('请输入int类型页数!')
        path_template = "ershoufang/pg{}rs{}/"
        self.url_list.extend(
            city_url + path_template.format(page_no, wd)
            for page_no in range(1, pages + 1)
        )
    def is_have_city(self, city_list):
        """Return the entry of ``city_list`` that matches ``self.city``.

        An entry whose name equals ``self.city`` takes precedence. If there
        is none, the first entry whose name contains ``self.city`` is
        returned, so abbreviated input still resolves.

        Args:
            city_list: Iterable of dicts that each have a ``'city'`` key.

        Returns:
            The matching dict, or None when nothing matches or when
            ``self.city`` is empty or not a string.
        """
        wanted = self.city
        # An empty string is a substring of every name; treating it as a
        # match would silently select the first city in the table.
        if not isinstance(wanted, str) or not wanted:
            return None

        first_partial = None
        for entry in city_list:
            name = entry['city']
            if name == wanted:
                return entry
            if first_partial is None and wanted in name:
                first_partial = entry
        return first_partial

    def requests_html(self, url, timeout=10):
        """GET ``url`` with a desktop-browser User-Agent and return the response.

        Args:
            url: Address to request.
            timeout: Seconds ``requests`` waits for the server. Without a
                timeout a stalled connection would block the crawl forever.

        Returns:
            The ``requests`` response object when the status code is 200.

        Raises:
            Exception: If the server answers with any status other than 200.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise Exception('请求异常!!')
        return response

    def run(self):
        """Fetch and parse every queued URL in order, one second apart.

        The loop stops early once ``get_parse`` has set ``self.flag``,
        which it does when a page yields no listings. Exceptions raised by
        ``requests_html`` propagate to the caller.
        """
        for url in self.url_list:
            print('当前爬取页面: ', url)
            res = self.requests_html(url)
            self.get_parse(res.text)
            # Push the rows of this page to disk so they are not left in
            # the write buffer if a later request raises.
            self.f.flush()
            if self.flag:
                # No listings on this page: stop without a pointless delay.
                break
            time.sleep(1)  # throttle between consecutive page requests

    def get_parse(self, html):
        """Extract the listings of one result page, write them to the CSV and print them.

        Each ``div`` with class ``info clear`` is treated as one listing.
        For every listing with a non-empty title, one row is written with
        district, title, community, total price, unit price, layout, area,
        orientation, decoration and floor. Elements or fields that are
        missing from the markup are written as empty strings rather than
        aborting the whole page.

        When the page contains no listing at all, a notice is printed and
        ``self.flag`` is set to True.

        Args:
            html: HTML text of a result page.
        """
        def text_of(tag):
            # Missing elements yield '' instead of raising AttributeError.
            return tag.get_text(strip=True) if tag is not None else ''

        soup = BeautifulSoup(html, 'html.parser')
        listings = soup.find_all('div', attrs={"class": "info clear"})
        if not listings:
            print('未获取到房源信息')
            self.flag = True
            return

        for item in listings:
            # Listing title; listings without one are not recorded.
            title_div = item.find('div', attrs={"class": "title"})
            title = text_of(title_div.a if title_div is not None else None)
            if not title:
                continue

            # The first link is used as the community name, the last one
            # as the district.
            flood_div = item.find('div', attrs={"class": "flood"})
            links = flood_div.find_all('a') if flood_div is not None else []
            flood = text_of(links[0]) if links else ''
            address = text_of(links[-1]) if links else ''

            # Pipe-separated house details; split once and pad so a short
            # line cannot raise IndexError.
            info_text = text_of(item.find('div', attrs={"class": "address"}))
            fields = [part.strip() for part in info_text.split('|')]
            fields += [''] * (5 - len(fields))
            huxing, mianji, chaoxiang, fengge, louceng = fields[:5]

            # Total price
            total_div = item.find('div', attrs={"class": "totalPrice"})
            totalprice = text_of(total_div.span if total_div is not None else None)

            # Unit price
            unit_div = item.find('div', attrs={"class": "unitPrice"})
            unitPrice = text_of(unit_div.span if unit_div is not None else None)

            # Column order matches the CSV header row.
            self.csv_file.writerow([address, title, flood, totalprice, unitPrice, huxing, mianji, chaoxiang, fengge, louceng])
            print('所在区域：{}，'
                  '房间名称：{}，'
                  '小区名称：{}，'
                  '房间总价/万：{}，'
                  '房间单价：{}，'
                  '房间户型：{}，'
                  '房间面积：{}，'
                  '房间朝向：{}，'
                  '装修风格：{}，'
                  '房间楼层：{}，'
                  .format(address, title, flood, totalprice, unitPrice, huxing, mianji, chaoxiang, fengge, louceng))

if __name__ == '__main__':
    # Script entry point: crawl up to 100 result pages for Shenzhen.
    lj = LianJiaSpider(city='深圳', pages=100)
    try:
        lj.run()
    finally:
        # The spider opens its CSV file in __init__; close it here so the
        # handle is released and all rows are written even if the crawl
        # stops with an exception.
        lj.f.close()
